/*
 *    DocumentTagger.java
 *    Copyright (C) 2007 David Milne, d.n.milne@gmail.com
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


package org.wikipedia.miner.annotation.tagging;


import gnu.trove.map.hash.TIntDoubleHashMap;

import java.util.*;

import org.wikipedia.miner.annotation.* ;
import org.wikipedia.miner.annotation.preprocessing.* ;
import org.wikipedia.miner.util.*;

/**
 * Tags documents by adding markup to to the topics it mentions. 
 * <p>
 * You can use this to tag all topics by giving it the output of the TopicDetector, or 
 * just those that should be linked to (from LinkDetector) or those that the document 
 * is about (from Indexers). If using the latter two, then you should first modify the list 
 * of topics to include only those that are most likely to be links or key topics 
 * (i.e those above a certain weight). 
 * 
 * @author David Milne
 */
public abstract class DocumentTagger {

	/**
	 * Options for tagging or ignoring repeat mentions of topics
	 */
	public enum RepeatMode {

		/**
		 * All mentions of a topic will be tagged
		 */
		ALL, 
		/**
		 * Only the first mention of a topic will be tagged
		 */
		FIRST, 
		/**
		 * Only the first mention within each region (e.g. DIV) will be tagged
		 */
		FIRST_IN_REGION
	} ;	

	/**
	 * Specifies how terms in the document will be replaced by tags. A tagger for html, for example, might return a link to the relevant Wikipedia article.
	 * 
	 * @param term the text in the original document that will be tagged.
	 * @param topic the relevant topic that the term was disambiguated to.
	 * @return the tag that will replace the given term.
	 */
	public abstract String getTag(String term, Topic topic) ;

	/**
	 * Tags the given text with occurrences of the given topics. 
	 * 
	 * @param doc the document to be tagged
	 * @param topics a set of automatically detected topics, i.e. from TopicDetector or LinkDetector
	 * @param repeatMode ALL, FIRST, or FIRST_IN_REGION 
	 * @return the tagged text
	 */
	public String tag(PreprocessedDocument doc, Collection<Topic> topics, RepeatMode repeatMode) {

		doc.resetRegionTracking() ;

		HashMap<Integer,Topic> topicsById = new HashMap<Integer, Topic>() ;
		for (Topic topic: topics) 
			topicsById.put(topic.getId(), topic) ;

		ArrayList<TopicReference> references = resolveCollisions(topics) ;

		String originalText = doc.getOriginalText() ;
		StringBuffer wikifiedText = new StringBuffer() ;
		int lastIndex = 0 ;

		HashSet<Integer> doneIds = new HashSet<Integer>() ;

		for (TopicReference reference:references) {
			int start = reference.getPosition().getStart() ; 
			int end = reference.getPosition().getEnd() ;
			int id = reference.getTopicId() ;

			Topic topic = topicsById.get(id) ;	

			//System.out.println("considering tagging " + topic + " at " + reference.getPosition()) ;

			if (repeatMode == RepeatMode.FIRST_IN_REGION)
				doneIds = doc.getDoneIdsInCurrentRegion(start) ;

			if (topic != null && (repeatMode == RepeatMode.ALL || !doneIds.contains(id))) {

				doneIds.add(id) ;
				wikifiedText.append(originalText.substring(lastIndex, start)) ;
				wikifiedText.append(getTag(originalText.substring(start, end), topic)) ;

				lastIndex = end ;

				//System.out.println(" - tagged") ;
			}
		}

		wikifiedText.append(originalText.substring(lastIndex)) ;
		return wikifiedText.toString() ;
	}
	
	private ArrayList<TopicReference> resolveCollisions(Collection<Topic> topics) {
		

		//build up a list of topic references and hashmap of topic weights
		ArrayList<TopicReference> references = new ArrayList<TopicReference>() ;
		TIntDoubleHashMap topicWeights = new TIntDoubleHashMap() ;
		
		for(Topic topic: topics) {	
			for (Position pos: topic.getPositions()) {
				topicWeights.put(topic.getId(), topic.getWeight()) ;

				TopicReference tr = new TopicReference(null, topic.getId(), pos) ;
				references.add(tr) ;
			}
		}
		//sort references
		Collections.sort(references) ;
		
		
		for (int i=0 ; i<references.size(); i++) {
			TopicReference outerRef = references.get(i) ;

			
			//identify weight of this reference
			double outerWeight = topicWeights.get(outerRef.getTopicId());

			//identify references overlapped by this one, and their total weight
			Vector<TopicReference> innerReferences = new Vector<TopicReference>() ;
			double maxInnerWeight = 0 ;
			for (int j=i+1 ; j<references.size(); j++){
				TopicReference innerRef = references.get(j) ;

				if (outerRef.overlaps(innerRef)) {
					innerReferences.add(innerRef) ;	
					
					double innerWeight = topicWeights.get(innerRef.getTopicId());
					if (innerWeight > maxInnerWeight)
						maxInnerWeight = innerWeight ;
				} else {
					break ;
				}
			}

			if ((maxInnerWeight*0.8) > outerWeight) {
				// want to keep the inner references
				references.remove(i) ;
				i = i-1 ;				
			} else {
				//want to keep the outer reference
				for (int j=0 ; j<innerReferences.size() ; j++) {
					references.remove(i+1) ;
				}
			}
		}
		
		return references ;
	}

	/*
	private Vector<TopicReference> resolveCollisions(Collection<Topic> topics) {

		TIntDoubleHashMap topicWeights = new TIntDoubleHashMap() ;
		

		TreeSet<TopicReference> temp = new TreeSet<TopicReference>() ;

		for(Topic topic: topics) {	
			for (Position pos: topic.getPositions()) {
				topicWeights.put(topic.getId(), topic.getWeight()) ;

				TopicReference tr = new TopicReference(null, topic.getId(), pos) ;
				temp.add(tr) ;
			}
		}

		Vector<TopicReference> references = new Vector<TopicReference>() ;
		references.addAll(temp) ;

		for (int i=0 ; i<references.size(); i++) {
			TopicReference reference = references.elementAt(i) ;

			Vector<TopicReference> overlappedTopics = new Vector<TopicReference>() ;

			for (int j=i+1 ; j<references.size(); j++){
				TopicReference reference2 = references.elementAt(j) ;

				if (reference.overlaps(reference2)) 
					overlappedTopics.add(reference2) ;
			}

			for (int j=0 ; j<overlappedTopics.size() ; j++) {
				references.removeElementAt(i+1) ;
			}
			
			/*
			
			//TODO: why is all of this blanked out??

			double refWeight = 0 ;
			Integer refId = reference.getTopicId() ;

			if (topicWeights.containsKey(refId))
				refWeight = topicWeights.get(refId) ;
			
			double overlapWeight = 0 ;

			for (int j=i+1 ; j<references.size(); j++){
				TopicReference reference2 = references.elementAt(j) ;

				if (reference.overlaps(reference2)) {
					//System.out.println("--" + getNGram(words, c.getStartIndex(), c.getEndIndex()) + " overlaps " + getNGram(words, c1.getStartIndex(), c1.getEndIndex()));
					overlappedTopics.add(reference2) ;

					double ref2Weight = 0 ;
					Integer ref2Id = reference2.getTopicId() ;
					if (topicWeights.containsKey(ref2Id))
						ref2Weight = topicWeights.get(ref2Id) ;

					overlapWeight = overlapWeight + ref2Weight ; 
				} else {
					break ;
				}
			}

			if (overlappedTopics.size() > 0)
				overlapWeight = overlapWeight / overlappedTopics.size() ;

			if (overlapWeight > refWeight) {
				// want to keep the overlapped items
				references.removeElementAt(i) ;
				i = i-1 ;				
			} else {
				//want to keep the overlapping item
				for (int j=0 ; j<overlappedTopics.size() ; j++) {
					references.removeElementAt(i+1) ;
				}
			}
			
		}
		
		return references ;
	}*/
}
